load("~/Documents/MiGASti/Databases/gene_matrix.RData")
metadata <- read.table("~/Documents/MiGASti/Databases/metadata.txt")
#set rownames to Sample
row.names(metadata) <- metadata$Sample 
setwd("~/Documents/MiGASti/Databases")
#exclude samples that did not pass QC filtering
exclude <- read.table("samples2remove.txt")
exclude <- exclude$x
genes_counts_filt = genes_counts[, !colnames(genes_counts) %in% exclude] 
#Excludes the samples from filters. 
#dim(genes_counts_filt)
metadata_filt = metadata[ !(rownames(metadata) %in% exclude), ]
length(metadata_filt)
gencode_30 = read.table("~/Documents/MiGASti/Databases/ens.geneid.gencode.v30")
colnames(gencode_30) = c("ensembl","symbol")
#order metadata and genes counts
genes_counts_ordered <- genes_counts_filt[,rownames(metadata_filt)]
#head(genes_counts_ordered)
all(rownames(metadata_filt) == colnames (genes_counts_ordered)) #TRUE

Preparing the samples for DEG

#remove uncultured samples
metadata_cultured <- metadata_filt[metadata_filt$Stimulation != "ununstim",]
#check numbers
#dim(metadata_cultured)
#check numbers per stimulation
#table(metadata_filt$Stimulation)
#select only CC samples in metadata
metadata_CC = metadata_cultured[metadata_cultured$Region=="CC",]
#select only CC samples in genes counts
genes_counts_CC <- genes_counts_ordered[,metadata_CC$Sample]
#order metadata and genes_counts
genes_counts_CC_ordered <- genes_counts_CC[,rownames(metadata_CC)]
#check ordering
all(rownames(metadata_CC) == colnames (genes_counts_CC_ordered)) #TRUE

[1] TRUE

#round counts; deseq2 can only handle integers
genes_counts_CC_ordered <- round(genes_counts_CC_ordered, digits=0)

#make sure covariate variables are the right format 
#cannot create dds object with numeric values
metadata_CC$Donor_id <- as.factor(metadata_CC$Donor_id)
metadata_CC$age <- as.integer(metadata_CC$age)
metadata_CC$sex <- as.factor(metadata_CC$sex)
metadata_CC$Stimulation <- as.factor(metadata_CC$Stimulation)
metadata_CC$picard_pct_ribosomal_bases = scale(metadata_CC$picard_pct_ribosomal_bases)
metadata_CC$picard_pct_mrna_bases = scale(metadata_CC$picard_pct_mrna_bases)
metadata_CC$picard_pct_pf_reads_aligned = scale(metadata_CC$picard_pct_pf_reads_aligned)
metadata_CC$picard_pct_percent_duplication = scale(metadata_CC$picard_pct_percent_duplication)

#adjust for: ~ age + (1|donor_id) + picard_pct_ribosomal_bases + picard_pct_mrna_bases +   picard_pct_percent_duplication + picard_pct_pf_reads_aligned 
table(metadata_CC$Stimulation)

IFNy LPS unstim 10 16 15

DESeq2 of CC samples

#createDeSEQ2 object for LPS
dds <- DESeqDataSetFromMatrix(countData = genes_counts_CC_ordered,
                              colData = metadata_CC,
                              design = ~ age + sex + picard_pct_ribosomal_bases + picard_pct_mrna_bases + picard_pct_percent_duplication + picard_pct_pf_reads_aligned + Stimulation) 
#variable of interest at end of the formula

#Make sure that control group is set as the reference group
dds$Stimulation <- relevel(dds$Stimulation, ref="unstim")
#head(dds)

#filter: CPM > 1 in 50% of the samples 
keep.exp = rowSums(cpm(genes_counts_CC_ordered) > 1) >= 0.5*ncol(genes_counts_CC_ordered)
dds = dds[keep.exp,]

#Run differential expression 
dds <- DESeq(dds, betaPrior = FALSE)
resultsNames(dds)

[1] “Intercept” “age”
[3] “sex_m_vs_f” “picard_pct_ribosomal_bases”
[5] “picard_pct_mrna_bases” “picard_pct_percent_duplication” [7] “picard_pct_pf_reads_aligned” “Stimulation_IFNy_vs_unstim”
[9] “Stimulation_LPS_vs_unstim”

DESeq2: LPS vs unstim

Number of differentially expressed genes

# generate results table for LPS vs unstim
res_LPS <- results(dds, name="Stimulation_LPS_vs_unstim")
sum(res_LPS$padj < 0.05, na.rm=TRUE)

[1] 90

resOrdered_LPS <- res_LPS[order(res_LPS$pvalue),] 
resOrdered_LPS <- as.data.frame(resOrdered_LPS)

Volcano plot LPS vs unstim

head(res_LPS)

log2 fold change (MLE): Stimulation LPS vs unstim Wald test p-value: Stimulation LPS vs unstim DataFrame with 6 rows and 6 columns baseMean log2FoldChange lfcSE stat pvalue ENSG00000000003.14 21.7049 -3.42480e-01 0.432788 -0.791333912 0.428749 ENSG00000000419.12 153.6174 -2.18683e-01 0.161789 -1.351657585 0.176485 ENSG00000000457.14 89.1186 1.35362e-01 0.178469 0.758462602 0.448174 ENSG00000000460.17 24.0644 1.54539e-01 0.246986 0.625699623 0.531512 ENSG00000000938.13 501.9808 1.83077e-01 0.200124 0.914816234 0.360288 ENSG00000000971.15 45.4872 -9.08342e-05 0.278618 -0.000326017 0.999740 padj ENSG00000000003.14 0.952358 ENSG00000000419.12 0.843696 ENSG00000000457.14 0.955951 ENSG00000000460.17 0.963349 ENSG00000000938.13 0.933233 ENSG00000000971.15 0.999938

with(res_LPS, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2)))
with(subset(res_LPS, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

MA plot LPS vs unstim

#The function plotMA shows the log2 fold changes attributable to a given variable over the mean of normalized counts for all the samples in the DESeqDataSet. Points will be colored if the adjusted p value is less than 0.1. Points which fall out of the window are plotted as open triangles pointing either up or down.

plotMA(res_LPS, ylim=c(-2,2))

TOP differentially expressed genes LPS vs unstim

setDT(resOrdered_LPS, keep.rownames = "ensembl")
resOrdered_LPS <- left_join(resOrdered_LPS, gencode_30, by = "ensembl")
resOrdered_LPS_top = resOrdered_LPS[order(resOrdered_LPS$padj) ,]
setDT(resOrdered_LPS_top, keep.rownames = "ensembl")
resOrdered_LPS_top = resOrdered_LPS_top[, c("ensembl", "symbol", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
createDT(resOrdered_LPS_top)
write.table(resOrdered_LPS_top, "DEG_LPS_CC.txt")

DESeq2: IFNy vs unstim

Number of differentially expressed genes

# generate results table for IFNy vs unstim
res_IFNy <- results(dds, name="Stimulation_IFNy_vs_unstim")
sum(res_IFNy$padj < 0.05, na.rm=TRUE)

[1] 76

resOrdered_IFNy <- res_IFNy[order(res_IFNy$pvalue),] 
resOrdered_IFNy <- as.data.frame(resOrdered_IFNy)

Volcano plot IFNy vs unstim

head(res_IFNy)

log2 fold change (MLE): Stimulation IFNy vs unstim Wald test p-value: Stimulation IFNy vs unstim DataFrame with 6 rows and 6 columns baseMean log2FoldChange lfcSE stat pvalue ENSG00000000003.14 21.7049 0.0310747 0.538301 0.0577273 0.95396583 ENSG00000000419.12 153.6174 -0.0599452 0.200484 -0.2990026 0.76493810 ENSG00000000457.14 89.1186 0.6415111 0.219406 2.9238550 0.00345726 ENSG00000000460.17 24.0644 0.2143219 0.303585 0.7059705 0.48020647 ENSG00000000938.13 501.9808 0.2436724 0.249487 0.9766932 0.32872106 ENSG00000000971.15 45.4872 0.6430284 0.346537 1.8555821 0.06351314 padj ENSG00000000003.14 NA ENSG00000000419.12 0.989279 ENSG00000000457.14 0.248879 ENSG00000000460.17 NA ENSG00000000938.13 0.957065 ENSG00000000971.15 0.852141

with(res_IFNy, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-10,10)))
with(subset(res_IFNy, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

MA plot IFNy vs unstim

plotMA(res_IFNy, ylim=c(-2,2))

TOP differentially expressed genes IFNy vs unstim

setDT(resOrdered_IFNy, keep.rownames = "ensembl")
resOrdered_IFNy <- merge(resOrdered_IFNy, gencode_30, by = "ensembl")
resOrdered_IFNy_top = resOrdered_IFNy[order(resOrdered_IFNy$padj) ,]
setDT(resOrdered_IFNy_top, keep.rownames = "ensembl")
resOrdered_IFNy_top = resOrdered_IFNy_top[, c("ensembl", "symbol", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
createDT(resOrdered_IFNy_top)
write.table(resOrdered_IFNy_top, "DEG_IFNy_CC.txt")

#Create genelists with Log2FC < 1 and < -1 for different stimuli

resOrdered_LPS_p <- subset(resOrdered_LPS_top, padj < 0.05)
resOrdered_LPS_LFC <- subset(resOrdered_LPS_p, log2FoldChange > 1 | log2FoldChange < -1)
write.table(resOrdered_LPS_LFC, "LPS_CC_FC1.txt")

resOrdered_IFNy_p <- subset(resOrdered_IFNy_top, padj < 0.05)
resOrdered_IFNy_LFC <- subset(resOrdered_IFNy_p, log2FoldChange > 1 | log2FoldChange < -1)
write.table(resOrdered_IFNy_LFC, "IFNy_CC_FC1.txt")